In [1]:
import numpy as np  
print(f"NumPy version: {np.__version__}")
print(f"NumPy path: {np.__path__}")

import pandas as pd
print(f"Pandas version: {pd.__version__}")
print(f"Pandas path: {pd.__path__}")
NumPy version: 1.24.2
NumPy path: ['C:\\Users\\ddj6tu\\AppData\\Roaming\\Python\\Python311\\site-packages\\numpy']
C:\Users\ddj6tu\AppData\Roaming\Python\Python311\site-packages\pandas\core\arrays\masked.py:60: UserWarning: Pandas requires version '1.3.6' or newer of 'bottleneck' (version '1.3.5' currently installed).
  from pandas.core import (
Pandas version: 2.2.3
Pandas path: ['C:\\Users\\ddj6tu\\AppData\\Roaming\\Python\\Python311\\site-packages\\pandas']
In [2]:
import seaborn as sns
import nltk
import re
from glob import glob
import matplotlib.pyplot as plt

LIB¶

In [3]:
source_file_list = sorted(glob("C:\\Users\\ddj6tu\\Documents\\GitHub\\DS5001\\Final_Project_ddj6tu\\data\\*.*"))
In [4]:
source_file_list
Out[4]:
['C:\\Users\\ddj6tu\\Documents\\GitHub\\DS5001\\Final_Project_ddj6tu\\data\\ANDERSON_AsteroidCity.txt',
 'C:\\Users\\ddj6tu\\Documents\\GitHub\\DS5001\\Final_Project_ddj6tu\\data\\ANDERSON_BottleRocket.txt',
 'C:\\Users\\ddj6tu\\Documents\\GitHub\\DS5001\\Final_Project_ddj6tu\\data\\ANDERSON_FrenchDispatch.txt',
 'C:\\Users\\ddj6tu\\Documents\\GitHub\\DS5001\\Final_Project_ddj6tu\\data\\ANDERSON_GrandBudapestHotel.txt',
 'C:\\Users\\ddj6tu\\Documents\\GitHub\\DS5001\\Final_Project_ddj6tu\\data\\ANDERSON_MoonriseKingdom.txt',
 'C:\\Users\\ddj6tu\\Documents\\GitHub\\DS5001\\Final_Project_ddj6tu\\data\\ANDERSON_RoyalTennenbaums.txt',
 'C:\\Users\\ddj6tu\\Documents\\GitHub\\DS5001\\Final_Project_ddj6tu\\data\\ANDERSON_Rushmore.txt',
 'C:\\Users\\ddj6tu\\Documents\\GitHub\\DS5001\\Final_Project_ddj6tu\\data\\salex_nrc.csv']
In [5]:
script_id = list(range(1,8))
In [6]:
years = [2023, 1996, 2021, 2014, 2012, 2001, 1998]
In [7]:
eras = ['late', 'early', 'late', 'middle', 'middle', 'early', 'early']
In [8]:
titles = ['Asteroid City', 'Bottle Rocket', 'French Dispatch', 'Grand Budapest Hotel', 'Moonrise Kingdom', 'Royal Tennenbaums', 'Rushmore']
In [9]:
LIB = pd.DataFrame(list(zip(script_id, titles, years, eras, source_file_list)), 
                  columns=['script_id', 'title', 'years', 'era', 'source']).set_index('script_id').sort_index()
In [10]:
LIB
Out[10]:
title years era source
script_id
1 Asteroid City 2023 late C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_...
2 Bottle Rocket 1996 early C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_...
3 French Dispatch 2021 late C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_...
4 Grand Budapest Hotel 2014 middle C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_...
5 Moonrise Kingdom 2012 middle C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_...
6 Royal Tennenbaums 2001 early C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_...
7 Rushmore 1998 early C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_...
In [11]:
roman = '[IVXLCM]+'
caps = "[A-Z';, -]+"
ohco_pat_list = [
    (1,   r"^\s*(INT\.|TITLE SEQUENCE:|EXT\.|SPLIT-SCREEN:|INSERT:|CUT TO:|MONTAGE:)"),
    (2,   r"^\s*(EXT\.|INT\.|EXT/INT\.)"),
    (3,   r"^\s*(Obituary|EXT\.|CUT TO:|MONTAGE:|In the File Room:|INT\.|Sketchbook|SPLIT-SCREEN:|Story #1|TITLE:|Story #2|INSERT:|INT/EXT\.|Split-screen:)"),
    (4,   r"^\s*(EXT\.|INT\.|MONTAGE:|CUT TO:|INSERT:|TITLE:)"),
    (5,   r"^\s*(INT\.|EXT\.|TITLES OVER:|CUT TO:|INSERT:|MONTAGE:|CUT TO:|TITLE:)"),
    (6,   r"^\s*(INSERT:|CUT TO:|INT\.|EXT\.|MONTAGE:)"),
    (7,   r"^\s*(INT\.|CUT TO:|TITLE:|EXT\.|INSERT|RESEARCH MONTAGE:| OCTOBER MONTAGE:|THANKSGIVING MONTAGE:|DECEMBER MONTAGE:)")
]
In [12]:
LIB['scene_regex'] = LIB.index.map(pd.Series({x[0]:x[1] for x in ohco_pat_list}))
In [13]:
LIB
Out[13]:
title years era source scene_regex
script_id
1 Asteroid City 2023 late C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(INT\.|TITLE SEQUENCE:|EXT\.|SPLIT-SCREEN:...
2 Bottle Rocket 1996 early C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(EXT\.|INT\.|EXT/INT\.)
3 French Dispatch 2021 late C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(Obituary|EXT\.|CUT TO:|MONTAGE:|In the Fi...
4 Grand Budapest Hotel 2014 middle C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(EXT\.|INT\.|MONTAGE:|CUT TO:|INSERT:|TITLE:)
5 Moonrise Kingdom 2012 middle C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(INT\.|EXT\.|TITLES OVER:|CUT TO:|INSERT:|...
6 Royal Tennenbaums 2001 early C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(INSERT:|CUT TO:|INT\.|EXT\.|MONTAGE:)
7 Rushmore 1998 early C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(INT\.|CUT TO:|TITLE:|EXT\.|INSERT|RESEARC...

Corpus¶

In [14]:
clip_pats = [
        r'START OF SCRIPT', 
        r'END OF SCRIPT'
    ]
In [15]:
import pandas as pd
import numpy as np
import nltk

class TextParser():
    """
    A class to parse a single Gutenberg-type text file into a TOKENS dataframe with
    an OHCO index. Also has methods to extract a VOCAB table, although vocabulary
    tables ought to be generated at the corpus level.
    
    Sample parameter values:

    ohco_pats = [
        ('chapter', r"^\s*(chapter|letter)\s+(\d+)", 'm')    
    ]

    clip_pats = [
        r'START OF GUTENBERG PROJECT', 
        r'^\s*THE END'
    ]

    """

    # TODO: Make these private
    src_imported:bool = False       
    src_clipped:bool = False
    src_col_suffix:str ='_str'

    join_pat:str = r'\n'
    strip_hyphens:bool = False
    strip_whitespace:bool = False
    verbose:bool = False

    stanford_pos_model:str = "english-bidirectional-distsim.tagger"
    stanford_pos_model_path = None
        
    # We assume all OHCOs have sentences and tokens
    # and that there are terminal in the list.
    ohco_pats:[] = [
        ('para', r"\n\n", 'd'),
        ('sent', r"[.?!;:]+", 'd'),
        ('token', r"[\s',-]+", 'd')
    ]
        
    _ohco_type:{} = {
        'd': '_num',
        'm': '_id'
    }
        
    def __init__(self, src_file:str, ohco_pats:[], clip_pats:[], use_nltk=True):
        """Initialize the object and extract config info. If using NLTK, download resources."""
        self.src_file = src_file            
        self.clip_pats = clip_pats # TODO: Validate
        self.ohco_pats = ohco_pats + self.ohco_pats # TODO: Validate
        self.OHCO = [item[0]+self._ohco_type[item[2]] for item in self.ohco_pats]
        self.ohco_names = [item[0] for item in self.ohco_pats]
        self.use_nltk = use_nltk

        if self.use_nltk:
            # Override the last two OHCO items
            self.ohco_pats[-2] = ('sent', None, 'nltk')
            self.ohco_pats[-1] = ('token', None, 'nltk')
            # Make sure you have the NLTK stuff
            for package in [
                'tokenizers/punkt', 
                'taggers/averaged_perceptron_tagger', 
                'corpora/stopwords', 
                'help/tagsets'
            ]:
                if self.verbose: print("Checking", package)
                try:
                    nltk.data.find(package)
                except IndexError:
                    nltk.download(package)
            
    def import_source(self, strip:bool=True, char_encoding:str="utf-8-sig"):
        """Convert a raw text file into a dataframe of lines."""
        if self.verbose: print("Importing ", self.src_file)
        text_lines = open(self.src_file,'r', encoding=char_encoding).readlines()
        self.LINES = pd.DataFrame({'line_str':text_lines})
        self.LINES.index.name = 'line_id'
        if strip:
            self.LINES.line_str = self.LINES.line_str.str.strip()
        self.src_imported = True
        if self.verbose: print("Clipping text")
        self._clip_lines()
        return self        

    def _clip_lines(self):
        """Remove cruft lines from beginning and/or end of file."""
        start_pat = self.clip_pats[0]
        end_pat = self.clip_pats[1]
        start = self.LINES.line_str.str.contains(start_pat, regex=True)
        end = self.LINES.line_str.str.contains(end_pat, regex=True)
        try:
            start_line_num = self.LINES.loc[start].index[0]
        except IndexError:
            raise ValueError("Clip start pattern not found.")            
        try:
            end_line_num = self.LINES.loc[end].index[0]
        except IndexError:
            raise ValueError("Clip end pattern not found.")
        self.LINES = self.LINES.loc[start_line_num + 1 : end_line_num - 2]
        self.src_clipped == True
        
    def parse_tokens(self):
        """Convert lines to tokens based on OHCO."""
        if self.src_imported:

            # Start with the LINES df
            self.TOKENS = self.LINES.copy()

            # Walk through each level of the OHCO to build out TOKENS
            for i, level in enumerate(self.OHCO):

                if self.verbose: print(f"Parsing OHCO level {i} {level}", end=' ')

                # Define level-specific variables
                parse_type = self.ohco_pats[i][2]
                div_name = self.ohco_pats[i][0]
                div_pat = self.ohco_pats[i][1]
                if i == 0:
                    src_div_name = 'line'
                else:
                    src_div_name = self.ohco_names[i - 1] 
                src_col = f"{src_div_name}{self.src_col_suffix}"
                dst_col = f"{div_name}{self.src_col_suffix}"

                # By Milestone
                if parse_type == 'm':
                    if self.verbose: print(f"by milestone {div_pat}")
                    div_lines = self.TOKENS[src_col].str.contains(div_pat, regex=True, case=True)
                    self.TOKENS.loc[div_lines, div_name] = [i+1 for i in range(self.TOKENS.loc[div_lines].shape[0])]
                    self.TOKENS[div_name] = self.TOKENS[div_name].ffill()
                    self.TOKENS = self.TOKENS.loc[~self.TOKENS[div_name].isna()] 
                    self.TOKENS = self.TOKENS.loc[~div_lines] 
                    self.TOKENS[div_name] = self.TOKENS[div_name].astype('int')
                    self.TOKENS = self.TOKENS.groupby(self.ohco_names[:i+1], group_keys=True)[src_col]\
                        .apply(lambda x: '\n'.join(x)).to_frame(dst_col)

                    # print(self.TOKENS[dst_col].str.count(r'\n\n'))
                    print(src_col, dst_col)
                    print(self.TOKENS.columns)


                # By Delimitter
                elif parse_type == 'd':
                    if self.verbose: print(f"by delimitter {div_pat}")
                    self.TOKENS = self.TOKENS[src_col].str.split(div_pat, expand=True).stack().to_frame(dst_col)
                
                # By NLTK 
                elif parse_type == 'nltk':
                    if self.verbose: print(f"by NLTK model")

                    if level == 'sent_num':
                        self.TOKENS = self.TOKENS.para_str\
                                .apply(lambda x: pd.Series(nltk.sent_tokenize(x), dtype='string'))\
                                .stack()\
                                .to_frame('sent_str')
                    
                    if level == 'token_num':
                        if self.strip_hyphens == True:
                            self.TOKENS.sent_str = self.TOKENS.sent_str.str.replace(r"-", ' ')
                        if self.strip_whitespace == True:
                            self.TOKENS = self.TOKENS.sent_str\
                                    .apply(lambda x: pd.Series(
                                            nltk.pos_tag(nltk.WhitespaceTokenizer().tokenize(x)),
                                            dtype='object'
                                        )
                                    )
                        else:
                            self.TOKENS = self.TOKENS.sent_str\
                                    .apply(lambda x: pd.Series(nltk.pos_tag(nltk.word_tokenize(x))))
                        self.TOKENS = self.TOKENS.stack().to_frame('pos_tuple')
                        self.TOKENS['pos'] = self.TOKENS.pos_tuple.apply(lambda x: x[1])
                        self.TOKENS['token_str'] = self.TOKENS.pos_tuple.apply(lambda x: x[0])
                        self.TOKENS['term_str'] = self.TOKENS.token_str.str.lower()   
        
                else:
                    raise ValueError(f"Invalid parse option: {parse_type}.")

                # After creating the current OHCO level
                self.TOKENS.index.names = self.OHCO[:i+1]

            # After iterating through the OHCO

            if not self.use_nltk:
                self.TOKENS['term_str'] = self.TOKENS.token_str.str.replace(r'[\W_]+', '', regex=True).str.lower()  
            else:
                punc_pos = ['$', "''", '(', ')', ',', '--', '.', ':', '``']
                self.TOKENS['term_str'] = self.TOKENS[~self.TOKENS.pos.isin(punc_pos)].token_str\
                    .str.replace(r'[\W_]+', '', regex=True).str.lower()  
            
        else:
            raise RuntimeError("Source not imported. Please run .import_source()")

    def extract_vocab(self):
        """This should also be done at the corpus level."""
        self.VOCAB = self.TOKENS.term_str.value_counts().to_frame('n')
        self.VOCAB.index.name = 'term_str'
        self.VOCAB['n_chars'] = self.VOCAB.index.str.len()
        self.VOCAB['p'] = self.VOCAB['n'] / self.VOCAB['n'].sum()
        self.VOCAB['s'] = 1 / self.VOCAB['p']
        self.VOCAB['i'] = np.log2(self.VOCAB['s']) # Same as negative log probability (i.e. log likelihood)
        self.VOCAB['h'] = self.VOCAB['p'] * self.VOCAB['i']
        self.H = self.VOCAB['h'].sum()
        return self

    def annotate_vocab(self):
        """This should be done at the corpus level."""
        # Stopwords
        # Max POS
        # POS variability
        # Porter Stems
        pass

    def extract_pos_data(self):
        # TODO: Create dataframe for POS info, including Penn Treebank info
        pass

    def extract_named_entities(self):
        # TODO: Create dataframe of named entities
        pass

    def gather_tokens(self, level=0, grouping_col='term_str', cat_sep=' '):
        """Gather tokens into strings for arbitrary OHCO level."""
        max_level = len(self.OHCO) - 2 # Can't gather tokens at the token level :)
        if level > max_level:
            raise ValueError(f"Level {level} too high. Try between 0 and {max_level}")
        else:
            level_name = self.OHCO[level].split('_')[0]
            idx = self.TOKENS.index.names[:level+1]
            return self.TOKENS.groupby(idx)[grouping_col].apply(lambda x: x.str.cat(sep=cat_sep))\
                .to_frame(f'{level_name}_str')


if __name__ == '__main__':
    pass
In [16]:
def tokenize_collection(LIB):


    books = []
    for script_id in LIB.index:

        # Announce
        print("Tokenizing", script_id, LIB.loc[script_id].title)

        # Define vars
        scene_regex = LIB.loc[script_id].scene_regex
        ohco_pats = [('scene', scene_regex, 'm')]
        src_file_path = LIB.loc[script_id].source

        # Create object
        text = TextParser(src_file_path, ohco_pats=ohco_pats, clip_pats=clip_pats, use_nltk=True)

        # Define parameters
        text.verbose = True
        text.strip_hyphens = True
        text.strip_whitespace = True

        # Parse
        text.import_source().parse_tokens();

        # Name things
        text.TOKENS['script_id'] = script_id
        text.TOKENS = text.TOKENS.reset_index().set_index(['script_id'] + text.OHCO)

        # Add to list
        books.append(text.TOKENS)
        
    # Combine into a single dataframe
    CORPUS = pd.concat(books).sort_index()

    # Clean up
    del(books)
    del(text)
        
    print("Done")
        
    return CORPUS
In [17]:
CORPUS = tokenize_collection(LIB)
Tokenizing 1 Asteroid City
Importing  C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_Project_ddj6tu\data\ANDERSON_AsteroidCity.txt
Clipping text
Parsing OHCO level 0 scene_id by milestone ^\s*(INT\.|TITLE SEQUENCE:|EXT\.|SPLIT-SCREEN:|INSERT:|CUT TO:|MONTAGE:)
line_str scene_str
Index(['scene_str'], dtype='object')
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK model
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\2802075310.py:132: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.
  div_lines = self.TOKENS[src_col].str.contains(div_pat, regex=True, case=True)
Parsing OHCO level 3 token_num by NLTK model
Tokenizing 2 Bottle Rocket
Importing  C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_Project_ddj6tu\data\ANDERSON_BottleRocket.txt
Clipping text
Parsing OHCO level 0 scene_id by milestone ^\s*(EXT\.|INT\.|EXT/INT\.)
line_str scene_str
Index(['scene_str'], dtype='object')
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK model
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\2802075310.py:132: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.
  div_lines = self.TOKENS[src_col].str.contains(div_pat, regex=True, case=True)
Parsing OHCO level 3 token_num by NLTK model
Tokenizing 3 French Dispatch
Importing  C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_Project_ddj6tu\data\ANDERSON_FrenchDispatch.txt
Clipping text
Parsing OHCO level 0 scene_id by milestone ^\s*(Obituary|EXT\.|CUT TO:|MONTAGE:|In the File Room:|INT\.|Sketchbook|SPLIT-SCREEN:|Story #1|TITLE:|Story #2|INSERT:|INT/EXT\.|Split-screen:)
line_str scene_str
Index(['scene_str'], dtype='object')
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK model
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\2802075310.py:132: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.
  div_lines = self.TOKENS[src_col].str.contains(div_pat, regex=True, case=True)
Parsing OHCO level 3 token_num by NLTK model
Tokenizing 4 Grand Budapest Hotel
Importing  C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_Project_ddj6tu\data\ANDERSON_GrandBudapestHotel.txt
Clipping text
Parsing OHCO level 0 scene_id by milestone ^\s*(EXT\.|INT\.|MONTAGE:|CUT TO:|INSERT:|TITLE:)
line_str scene_str
Index(['scene_str'], dtype='object')
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK model
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\2802075310.py:132: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.
  div_lines = self.TOKENS[src_col].str.contains(div_pat, regex=True, case=True)
Parsing OHCO level 3 token_num by NLTK model
Tokenizing 5 Moonrise Kingdom
Importing  C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_Project_ddj6tu\data\ANDERSON_MoonriseKingdom.txt
Clipping text
Parsing OHCO level 0 scene_id by milestone ^\s*(INT\.|EXT\.|TITLES OVER:|CUT TO:|INSERT:|MONTAGE:|CUT TO:|TITLE:)
line_str scene_str
Index(['scene_str'], dtype='object')
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK model
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\2802075310.py:132: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.
  div_lines = self.TOKENS[src_col].str.contains(div_pat, regex=True, case=True)
Parsing OHCO level 3 token_num by NLTK model
Tokenizing 6 Royal Tennenbaums
Importing  C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_Project_ddj6tu\data\ANDERSON_RoyalTennenbaums.txt
Clipping text
Parsing OHCO level 0 scene_id by milestone ^\s*(INSERT:|CUT TO:|INT\.|EXT\.|MONTAGE:)
line_str scene_str
Index(['scene_str'], dtype='object')
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK model
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\2802075310.py:132: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.
  div_lines = self.TOKENS[src_col].str.contains(div_pat, regex=True, case=True)
Parsing OHCO level 3 token_num by NLTK model
Tokenizing 7 Rushmore
Importing  C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_Project_ddj6tu\data\ANDERSON_Rushmore.txt
Clipping text
Parsing OHCO level 0 scene_id by milestone ^\s*(INT\.|CUT TO:|TITLE:|EXT\.|INSERT|RESEARCH MONTAGE:| OCTOBER MONTAGE:|THANKSGIVING MONTAGE:|DECEMBER MONTAGE:)
line_str scene_str
Index(['scene_str'], dtype='object')
Parsing OHCO level 1 para_num by delimitter \n\n
Parsing OHCO level 2 sent_num by NLTK model
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\2802075310.py:132: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.
  div_lines = self.TOKENS[src_col].str.contains(div_pat, regex=True, case=True)
Parsing OHCO level 3 token_num by NLTK model
Done
In [18]:
CORPUS
Out[18]:
pos_tuple pos token_str term_str
script_id scene_id para_num sent_num token_num
1 1 0 0 0 (Black, NNP) NNP Black black
1 (and, CC) CC and and
2 (white., NN) NN white. white
1 0 0 (A, DT) DT A a
1 (1950's, CD) CD 1950's 1950s
... ... ... ... ... ... ... ... ...
7 118 134 2 15 (everyone, NN) NN everyone everyone
16 (slowly, RB) RB slowly slowly
17 (begins, VBZ) VBZ begins begins
18 (to, TO) TO to to
19 (dance., VB) VB dance. dance

177116 rows × 4 columns

LIB¶

In [19]:
nltk.download('averaged_perceptron_tagger_eng')
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\ddj6tu\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
Out[19]:
True
In [20]:
LIB['movie_len'] = CORPUS.groupby('script_id').term_str.count()
In [21]:
LIB['n_scenes'] = CORPUS.reset_index()[['script_id','scene_id']]\
    .drop_duplicates()\
    .groupby('script_id').scene_id.count()
In [22]:
LIB
Out[22]:
title years era source scene_regex movie_len n_scenes
script_id
1 Asteroid City 2023 late C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(INT\.|TITLE SEQUENCE:|EXT\.|SPLIT-SCREEN:... 27624 62
2 Bottle Rocket 1996 early C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(EXT\.|INT\.|EXT/INT\.) 19464 94
3 French Dispatch 2021 late C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(Obituary|EXT\.|CUT TO:|MONTAGE:|In the Fi... 30868 182
4 Grand Budapest Hotel 2014 middle C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(EXT\.|INT\.|MONTAGE:|CUT TO:|INSERT:|TITLE:) 27185 177
5 Moonrise Kingdom 2012 middle C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(INT\.|EXT\.|TITLES OVER:|CUT TO:|INSERT:|... 24877 138
6 Royal Tennenbaums 2001 early C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(INSERT:|CUT TO:|INT\.|EXT\.|MONTAGE:) 24939 222
7 Rushmore 1998 early C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(INT\.|CUT TO:|TITLE:|EXT\.|INSERT|RESEARC... 21535 118
In [23]:
LIB['movie_len'].mean()
Out[23]:
25213.14285714286

VOCAB¶

In [24]:
CORPUS = CORPUS[CORPUS.term_str != '']
In [25]:
CORPUS = CORPUS[CORPUS.pos != '']
In [26]:
CORPUS['pos_group'] = CORPUS.pos.str[:2]
In [27]:
CORPUS
Out[27]:
pos_tuple pos token_str term_str pos_group
script_id scene_id para_num sent_num token_num
1 1 0 0 0 (Black, NNP) NNP Black black NN
1 (and, CC) CC and and CC
2 (white., NN) NN white. white NN
1 0 0 (A, DT) DT A a DT
1 (1950's, CD) CD 1950's 1950s CD
... ... ... ... ... ... ... ... ... ...
7 118 134 2 15 (everyone, NN) NN everyone everyone NN
16 (slowly, RB) RB slowly slowly RB
17 (begins, VBZ) VBZ begins begins VB
18 (to, TO) TO to to TO
19 (dance., VB) VB dance. dance VB

177087 rows × 5 columns

In [28]:
VOCAB = CORPUS.term_str.value_counts().to_frame('n').sort_index()
VOCAB.index.name = 'term_str'
VOCAB['n_chars'] = VOCAB.index.str.len()
VOCAB['p'] = VOCAB.n / VOCAB.n.sum()
VOCAB['i'] = -np.log2(VOCAB.p)
In [29]:
VOCAB['max_pos'] = CORPUS[['term_str','pos']].value_counts().unstack(fill_value=0).idxmax(1)
In [30]:
VOCAB['max_pos_group'] = CORPUS[['term_str','pos_group']].value_counts().unstack(fill_value=0).idxmax(1)
In [31]:
VOCAB['n_pos'] = CORPUS[['term_str','pos']].value_counts().unstack().count(1)
VOCAB['cat_pos'] = CORPUS[['term_str','pos']].value_counts().to_frame('n').reset_index()\
    .groupby('term_str').pos.apply(lambda x: set(x))
In [32]:
sw = pd.DataFrame(nltk.corpus.stopwords.words('english'), columns=['term_str'])
sw = sw.reset_index().set_index('term_str')
sw.columns = ['dummy']
sw.dummy = 1
In [33]:
VOCAB['stop'] = VOCAB.index.map(sw.dummy)
VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')
In [34]:
from nltk.stem.porter import PorterStemmer
stemmer1 = PorterStemmer()
VOCAB['stem_porter'] = VOCAB.apply(lambda x: stemmer1.stem(x.name), 1)

from nltk.stem.snowball import SnowballStemmer
stemmer2 = SnowballStemmer("english")
VOCAB['stem_snowball'] = VOCAB.apply(lambda x: stemmer2.stem(x.name), 1)

from nltk.stem.lancaster import LancasterStemmer
stemmer3 = LancasterStemmer()
VOCAB['stem_lancaster'] = VOCAB.apply(lambda x: stemmer3.stem(x.name), 1)
In [35]:
VOCAB
Out[35]:
n n_chars p i max_pos max_pos_group n_pos cat_pos stop stem_porter stem_snowball stem_lancaster
term_str
1 41 1 0.000232 12.071454 NNP NN 4 {CD, PDT, NN, NNP} 0 1 1 1
10 12 2 0.000068 13.844044 CD CD 4 {CD, VBZ, NN, NNP} 0 10 10 10
100 8 3 0.000045 14.429006 CD CD 1 {CD} 0 100 100 100
10000 1 5 0.000006 17.429006 CD CD 1 {CD} 0 10000 10000 10000
100111 1 6 0.000006 17.429006 CD CD 1 {CD} 0 100111 100111 100111
... ... ... ... ... ... ... ... ... ... ... ... ...
zubrowkian 3 10 0.000017 15.844044 JJ JJ 2 {JJ, NNP} 0 zubrowkian zubrowkian zubrowk
à 3 1 0.000017 15.844044 NN NN 2 {NN, NNP} 0 à à à
éclair 1 6 0.000006 17.429006 NNP NN 1 {NNP} 0 éclair éclair éclair
éclairs 1 7 0.000006 17.429006 NNS NN 1 {NNS} 0 éclair éclair éclairs
école 1 5 0.000006 17.429006 NNP NN 1 {NNP} 0 école école écol

15306 rows × 12 columns

In [36]:
sw = pd.DataFrame({'stop': 1}, index=nltk.corpus.stopwords.words('english'))
sw.index.name='term_str'
In [37]:
if 'stop' not in VOCAB.columns:
    VOCAB = VOCAB.join(sw)
    VOCAB['stop'] = VOCAB['stop'].fillna(0).astype('int')
In [38]:
VOCAB = VOCAB[VOCAB.stop == 0]
In [39]:
OHCO = ['script_id', 'scene_id', 'para_num', 'sent_num', 'token_num']
PARA = OHCO[:3]
CHAP = OHCO[:2]
BOOK = OHCO[:1]
SENT = OHCO[:4]

tf_method = 'max'
bag = SENT
vocab_filter = 'dfidf'
n_terms = 1000

BOW¶

In [40]:
def create_bow(CORPUS, bag, item_type='term_str'):
    BOW = CORPUS.groupby(bag+[item_type])[item_type].count().to_frame('n')
    return BOW
In [41]:
BOW = create_bow(CORPUS, SENT)
In [42]:
BOW
Out[42]:
n
script_id scene_id para_num sent_num term_str
1 1 0 0 and 1
black 1
white 1
1 0 1950s 1
a 1
... ... ... ... ... ...
7 118 134 2 takes 1
the 2
to 1
walks 1
with 1

167833 rows × 1 columns

DTM¶

In [43]:
DTM = BOW.n.unstack(fill_value=0)
In [44]:
DTM
Out[44]:
term_str 1 10 100 10000 100111 101 10111 101111 102 102111 ... zoom zooms zs zubrowka zubrowkaofficially zubrowkian à éclair éclairs école
script_id scene_id para_num sent_num
1 1 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7 118 133 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
134 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

23483 rows × 15306 columns

DFIDF¶

In [45]:
VOCAB['df'] = DTM.astype('bool').sum()
VOCAB['idf'] = np.log2(len(DTM) / VOCAB.df)
VOCAB['dfidf'] = VOCAB.df * VOCAB.idf
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\1021891658.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  VOCAB['df'] = DTM.astype('bool').sum()
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\1021891658.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  VOCAB['idf'] = np.log2(len(DTM) / VOCAB.df)
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\1021891658.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  VOCAB['dfidf'] = VOCAB.df * VOCAB.idf
In [46]:
VOCAB
Out[46]:
n n_chars p i max_pos max_pos_group n_pos cat_pos stop stem_porter stem_snowball stem_lancaster df idf dfidf
term_str
1 41 1 0.000232 12.071454 NNP NN 4 {CD, PDT, NN, NNP} 0 1 1 1 40 9.197401 367.896040
10 12 2 0.000068 13.844044 CD CD 4 {CD, VBZ, NN, NNP} 0 10 10 10 12 10.934367 131.212399
100 8 3 0.000045 14.429006 CD CD 1 {CD} 0 100 100 100 8 11.519329 92.154633
10000 1 5 0.000006 17.429006 CD CD 1 {CD} 0 10000 10000 10000 1 14.519329 14.519329
100111 1 6 0.000006 17.429006 CD CD 1 {CD} 0 100111 100111 100111 1 14.519329 14.519329
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
zubrowkian 3 10 0.000017 15.844044 JJ JJ 2 {JJ, NNP} 0 zubrowkian zubrowkian zubrowk 3 12.934367 38.803100
à 3 1 0.000017 15.844044 NN NN 2 {NN, NNP} 0 à à à 3 12.934367 38.803100
éclair 1 6 0.000006 17.429006 NNP NN 1 {NNP} 0 éclair éclair éclair 1 14.519329 14.519329
éclairs 1 7 0.000006 17.429006 NNS NN 1 {NNS} 0 éclair éclair éclairs 1 14.519329 14.519329
école 1 5 0.000006 17.429006 NNP NN 1 {NNP} 0 école école écol 1 14.519329 14.519329

15173 rows × 15 columns

In [47]:
VOCAB.dfidf.sort_values(ascending=False).head(20)
Out[47]:
term_str
max        4283.756574
mr         4056.413705
looks      3958.476797
dignan     3914.095042
anthony    3386.125779
royal      3151.175932
gustave    3014.477170
back       2870.570211
one        2689.799238
says       2656.320299
dont       2481.588591
sam        2438.135107
bob        2345.754584
vo         2278.796777
im         2238.218371
blume      2169.891364
pause      2156.118937
zero       2137.699071
suzy       2086.704994
door       2049.298688
Name: dfidf, dtype: float64

TFIDF¶

In [48]:
TFIDF = (DTM.T / DTM.T.max()).T * VOCAB.idf
In [49]:
VIDX = VOCAB[VOCAB.max_pos.isin(['NN','NNS'])].sort_values('dfidf', ascending=False).head(n_terms).index
In [50]:
VOCAB.loc[VIDX].sort_index()
Out[50]:
n n_chars p i max_pos max_pos_group n_pos cat_pos stop stem_porter stem_snowball stem_lancaster df idf dfidf
term_str
2 49 1 0.000278 11.814296 NN NN 5 {NN, NNP, CD, VBP, CC} 0 2 2 2 49 8.904619 436.326344
aback 8 5 0.000045 14.429006 NN NN 3 {JJ, NN, NNP} 0 aback aback aback 8 11.519329 92.154633
accent 12 6 0.000068 13.844044 NN NN 2 {VBZ, NN} 0 accent accent acc 12 10.934367 131.212399
actor 55 5 0.000312 11.647646 NN NN 3 {JJ, NN, NNP} 0 actor actor act 50 8.875473 443.773646
actors 10 6 0.000057 14.107078 NNS NN 3 {NNS, NN, JJ} 0 actor actor act 10 11.197401 111.974010
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
yards 13 5 0.000074 13.728566 NNS NN 2 {NNS, NN} 0 yard yard yard 13 10.818889 140.645562
year 81 4 0.000459 11.089156 NN NN 3 {CD, NN, JJ} 0 year year year 77 8.252543 635.445778
years 73 5 0.000414 11.239182 NNS NN 3 {NNS, NN, JJ} 0 year year year 73 8.329505 608.053832
yet 33 3 0.000187 12.384612 NN NN 7 {NNS, NN, NNP, VBN, VB, RB, CC} 0 yet yet yet 32 9.519329 304.618531
yo 16 2 0.000091 13.429006 NN NN 2 {NN, NNP} 0 yo yo yo 9 11.349404 102.144637

1000 rows × 15 columns

Reduced TFIDF¶

In [51]:
TFIDF_RED = TFIDF[VIDX]
In [52]:
TFIDF_RED
Out[52]:
term_str vo pause door room nods hand front hands man right ... text shoots studio warning aback coldly pipes candle observatory pressure
script_id scene_id para_num sent_num
1 1 0 0 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 ... 0.0 0.0 11.519329 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7 118 133 0 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0
134 0 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 6.48041 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.00000 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0

23483 rows × 1000 columns

PCA¶

In [53]:
from sklearn.decomposition import PCA
from scipy.linalg import norm
import plotly_express as px
import seaborn as sns
In [54]:
n_comps = 5
pc_cols = [f"PC{i}" for i in range(n_comps)]
pca_engine = PCA(n_components=n_comps)
In [55]:
DCM = pd.DataFrame(pca_engine.fit_transform(TFIDF.fillna(0)), index=TFIDF.index)
DCM.columns = pc_cols # ['PC{}'.format(i) for i in DCM.columns]
DCM = DCM.join(LIB, on='script_id')
DCM['doc'] = DCM.apply(lambda x: "{}-{}".format(x.title, x.name[1]), 1)
In [56]:
LOADINGS = pd.DataFrame(pca_engine.components_.T * np.sqrt(pca_engine.explained_variance_))
LOADINGS.columns = ["PC{}".format(i) for i in LOADINGS.columns]
LOADINGS.index = TFIDF.columns
LOADINGS.index.name = 'term_str'
LOADINGS = LOADINGS.join(VOCAB)
In [57]:
def vis_pcs(a=0, b=1, label='title', hover_name='doc', symbol=None, size=None):
    return px.scatter(DCM, f"PC{a}", f"PC{b}", 
                    color=label, hover_name=hover_name, 
                     symbol=symbol, size=size,
                     marginal_x='box', height=800)
In [58]:
vis_pcs(0, 1)
C:\Users\ddj6tu\AppData\Local\anaconda3.1\Lib\site-packages\plotly\express\_core.py:1979: FutureWarning: When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.
  sf: grouped.get_group(s if len(s) > 1 else s[0])
In [59]:
vis_pcs(2, 3)
C:\Users\ddj6tu\AppData\Local\anaconda3.1\Lib\site-packages\plotly\express\_core.py:1979: FutureWarning:

When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.

In [60]:
vis_pcs(2, 3)
C:\Users\ddj6tu\AppData\Local\anaconda3.1\Lib\site-packages\plotly\express\_core.py:1979: FutureWarning:

When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.

In [61]:
def vis_loadings(a=0, b=1, hover_name='term_str'):
    return px.scatter(LOADINGS.reset_index(), f"PC{a}", f"PC{b}", 
                      text='term_str', hover_name='term_str',
                      size='dfidf', color='max_pos_group', 
                      marginal_x='box', height=800)
In [62]:
vis_loadings(0, 1)
C:\Users\ddj6tu\AppData\Local\anaconda3.1\Lib\site-packages\plotly\express\_core.py:1979: FutureWarning:

When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.

In [63]:
vis_loadings(2, 3)
C:\Users\ddj6tu\AppData\Local\anaconda3.1\Lib\site-packages\plotly\express\_core.py:1979: FutureWarning:

When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.

In [64]:
top_terms_sk= {}

data = []
for i in range(n_comps):
    for j in [0, 1]:
        data.append((f"PC{i}", j, ' '.join(LOADINGS.sort_values(f'PC{i}', ascending=bool(j)).head(10).index.to_list())))

comp_strs = pd.DataFrame(data)
comp_strs.columns =  ['pc', 'pole', 'top_terms']
comp_strs = comp_strs.set_index(['pc', 'pole'])
In [65]:
COMPS = comp_strs
In [66]:
COMPS
Out[66]:
top_terms
pc pole
PC0 0 scout master ward captain sharp bishop mr mrs ...
1 dignan anthony bob max royal cross miss dont k...
PC1 0 mr blume bishop henry moustafa max mrs fischer...
1 dignan scout master ward anthony dont bob roya...
PC2 0 dignan anthony bob mr dont know henry blume ge...
1 miss cross max gustave zero says sam eyes sits...
PC3 0 miss cross dignan anthony bob max looks master...
1 vo gustave captain sharp royal wright roebuck ...
PC4 0 max looks back royal anthony richie around cha...
1 vo cross miss wright roebuck dont captain shar...

LDA¶

In [67]:
DOCS = CORPUS[CORPUS.pos.str.match(r'^NNS?$')]\
    .groupby(SENT).term_str\
    .apply(lambda x: ' '.join(x))\
    .to_frame()\
    .rename(columns={'term_str':'doc_str'})
In [68]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
import plotly_express as px
In [69]:
ngram_range = (1, 2)
n_terms = 4000
n_topics = 20
max_iter = 5
n_top_terms = 9
In [70]:
count_engine = CountVectorizer(max_features=n_terms, ngram_range=ngram_range, stop_words='english')
count_model = count_engine.fit_transform(DOCS.doc_str)
TERMS = count_engine.get_feature_names_out()
In [71]:
VOCAB2 = pd.DataFrame(index=TERMS)
VOCAB2.index.name = 'term_str'
In [72]:
DTM2 = pd.DataFrame(count_model.toarray(), index=DOCS.index, columns=TERMS)
In [73]:
VOCAB2['doc_count'] = DTM2.astype('bool').astype('int').sum()
DOCS['term_count'] = DTM2.sum(1)
In [74]:
lda_engine = LDA(n_components=n_topics, max_iter=max_iter, learning_offset=50., random_state=0)
In [75]:
TNAMES = [f"T{str(x).zfill(len(str(n_topics)))}" for x in range(n_topics)]
In [76]:
lda_model = lda_engine.fit_transform(count_model)
In [77]:
THETA = pd.DataFrame(lda_model, index=DOCS.index)
THETA.columns.name = 'topic_id'
THETA.columns = TNAMES
In [78]:
PHI = pd.DataFrame(lda_engine.components_, columns=TERMS, index=TNAMES)
PHI.index.name = 'topic_id'
PHI.columns.name = 'term_str'
In [116]:
THETA
Out[116]:
T00 T01 T02 T03 T04 T05 T06 T07 T08 T09 T10 T11 T12 T13 T14 T15 T16 T17 T18 T19
script_id scene_id para_num sent_num
1 1 0 0 0.525000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000
1 0 0.016667 0.016667 0.016667 0.016667 0.016667 0.455268 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 0.244732 0.016667 0.016667
1 0.016667 0.016667 0.016667 0.683333 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667
2 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.525000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000
3 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.810000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7 118 132 2 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.525000
133 0 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.525000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000
1 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 0.683333 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667
134 1 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.525000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000
2 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.207715 0.612285

18938 rows × 20 columns

In [79]:
TOPICS = PHI.stack().groupby('topic_id')\
    .apply(lambda x: ' '.join(x.sort_values(ascending=False).head(n_top_terms).reset_index().term_str))\
    .to_frame('top_terms')
TOPICS
Out[79]:
top_terms
topic_id
T00 eyes smiles end telephone pocket guy hes mitch...
T01 door minute street manager play mouth motel wi...
T02 day girls troop rest record looks note express...
T03 way years box water author children arm father...
T04 face ground year group desk life place actor s...
T05 book girl hair watch eye office radio cover case
T06 hands car man glass frowns doors pair elevator...
T07 look sir mother bed sound jacket set coffee ma...
T08 school quietly thing men booth mirror seat she...
T09 silence page house sorry bathroom good home sa...
T10 room shrugs bottle away shirt glasses waiter t...
T11 hand time table woman middle sky business job ...
T12 pause nods scout course feet stares margaret b...
T13 right floor morning roof night light fingers c...
T14 window head cigarette station picture point ha...
T15 people lights youre building alien second scou...
T16 boy points tent wall doorway work paper room dont
T17 max vo air space family dirk huh friend breath
T18 zero hesitates walks sidewalk guard distance m...
T19 voice corner kind arms moment yeah foot sighs lot
In [80]:
THE_LIB = THETA.join(LIB)

pca = PCA(n_components=2)
pc = pca.fit_transform(PHI)
pca_df = pd.DataFrame(data=pc, columns=['PC1', 'PC2'], index=PHI.index)

mean_topic_weight = THETA.mean(axis=0)
mean_topic_weight.name = 'MeanWeight'
topic_info = pca_df.join(mean_topic_weight)

def get_topic_title(topic_name, the_lib, title_column='title'):
    relevant_doc = the_lib.nlargest(1, topic_name)
    if not relevant_doc.empty:
        return relevant_doc[title_column].iloc[0]
    return None
In [117]:
THE_LIB
Out[117]:
T00 T01 T02 T03 T04 T05 T06 T07 T08 T09 ... T17 T18 T19 title years era source scene_regex movie_len n_scenes
script_id scene_id para_num sent_num
1 1 0 0 0.525000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 ... 0.025000 0.025000 0.025000 Asteroid City 2023 late C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(INT\.|TITLE SEQUENCE:|EXT\.|SPLIT-SCREEN:... 27624 62
1 0 0.016667 0.016667 0.016667 0.016667 0.016667 0.455268 0.016667 0.016667 0.016667 0.016667 ... 0.244732 0.016667 0.016667 Asteroid City 2023 late C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(INT\.|TITLE SEQUENCE:|EXT\.|SPLIT-SCREEN:... 27624 62
1 0.016667 0.016667 0.016667 0.683333 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 ... 0.016667 0.016667 0.016667 Asteroid City 2023 late C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(INT\.|TITLE SEQUENCE:|EXT\.|SPLIT-SCREEN:... 27624 62
2 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.525000 0.025000 0.025000 0.025000 ... 0.025000 0.025000 0.025000 Asteroid City 2023 late C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(INT\.|TITLE SEQUENCE:|EXT\.|SPLIT-SCREEN:... 27624 62
3 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 ... 0.010000 0.010000 0.810000 Asteroid City 2023 late C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(INT\.|TITLE SEQUENCE:|EXT\.|SPLIT-SCREEN:... 27624 62
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7 118 132 2 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 ... 0.025000 0.025000 0.525000 Rushmore 1998 early C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(INT\.|CUT TO:|TITLE:|EXT\.|INSERT|RESEARC... 21535 118
133 0 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 ... 0.025000 0.025000 0.025000 Rushmore 1998 early C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(INT\.|CUT TO:|TITLE:|EXT\.|INSERT|RESEARC... 21535 118
1 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 0.016667 ... 0.016667 0.016667 0.016667 Rushmore 1998 early C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(INT\.|CUT TO:|TITLE:|EXT\.|INSERT|RESEARC... 21535 118
134 1 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 0.025000 ... 0.025000 0.025000 0.025000 Rushmore 1998 early C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(INT\.|CUT TO:|TITLE:|EXT\.|INSERT|RESEARC... 21535 118
2 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 0.010000 ... 0.010000 0.207715 0.612285 Rushmore 1998 early C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(INT\.|CUT TO:|TITLE:|EXT\.|INSERT|RESEARC... 21535 118

18938 rows × 27 columns

In [81]:
topic_titles = {}
for topic in PHI.index:
    topic_titles[topic] = get_topic_title(topic, THE_LIB)  

topic_title_series = pd.Series(topic_titles, name='title')
topic_df_vis = topic_info.join(topic_title_series)
topic_df_vis = topic_df_vis.dropna(subset=['title'])
In [82]:
def vis_loadings(a=1, b=2, hover_name=topic_df_vis.index.name or 'topic'):
    pc_x = f"PC{a}"
    pc_y = f"PC{b}"
    return px.scatter(topic_df_vis.reset_index(), x=pc_x, y=pc_y,
                      text=topic_df_vis.index,  
                      hover_name=hover_name,
                      size='MeanWeight',
                      color='title',
                      marginal_x='box',
                      height=800,
                      title=f"PCA of Topics (PC{a} vs PC{b})",
                      labels={'PC1': 'Principal Component 1',
                              'PC2': 'Principal Component 2',
                              'MeanWeight': 'Mean Topic Weight',
                              'title': 'Dominant Title'}) 

fig = vis_loadings(1, 2) 
fig.show()
C:\Users\ddj6tu\AppData\Local\anaconda3.1\Lib\site-packages\plotly\express\_core.py:1979: FutureWarning:

When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.

Sentiment¶

In [83]:
salex_csv = 'C:\\Users\\ddj6tu\\Documents\\GitHub\\DS5001\\Final_Project_ddj6tu\\data\\salex_nrc.csv'
SALEX = pd.read_csv(salex_csv).set_index('term_str')
SALEX.columns = [col.replace('nrc_','') for col in SALEX.columns]
In [84]:
SALEX
Out[84]:
anger anticipation disgust fear joy negative positive sadness surprise trust sentiment
term_str
abandon 0 0 0 1 0 1 0 1 0 0 -1
abandoned 1 0 0 1 0 1 0 1 0 0 -1
abandonment 1 0 0 1 0 1 0 1 1 0 -1
abduction 0 0 0 1 0 1 0 1 1 0 -1
aberration 0 0 1 0 0 1 0 0 0 0 -1
... ... ... ... ... ... ... ... ... ... ... ...
young 0 1 0 0 1 0 1 0 1 0 1
youth 1 1 0 1 1 0 1 0 1 0 1
zeal 0 1 0 0 1 0 1 0 1 1 1
zealous 0 0 0 0 1 0 1 0 0 1 1
zest 0 1 0 0 1 0 1 0 0 1 1

3688 rows × 11 columns

In [85]:
VOCAB_SENT = pd.concat([VOCAB.reset_index().set_index('term_str'), SALEX], join='inner', axis=1)
In [86]:
VOCAB_SENT
Out[86]:
n n_chars p i max_pos max_pos_group n_pos cat_pos stop stem_porter ... anticipation disgust fear joy negative positive sadness surprise trust sentiment
term_str
abandon 3 7 0.000017 15.844044 VB VB 1 {VB} 0 abandon ... 0 0 1 0 1 0 1 0 0 -1
abandoned 6 9 0.000034 14.844044 VBD VB 3 {JJ, VBN, VBD} 0 abandon ... 0 0 1 0 1 0 1 0 0 -1
absence 1 7 0.000006 17.429006 JJ JJ 1 {JJ} 0 absenc ... 0 0 1 0 1 0 1 0 0 -1
absent 1 6 0.000006 17.429006 NN NN 1 {NN} 0 absent ... 0 0 0 0 1 0 1 0 0 -1
abuse 1 5 0.000006 17.429006 NN NN 1 {NN} 0 abus ... 0 1 1 0 1 0 1 0 0 -1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
wound 5 5 0.000028 15.107078 NN NN 1 {NN} 0 wound ... 0 0 1 0 1 0 1 0 0 -1
wreck 1 5 0.000006 17.429006 NN NN 1 {NN} 0 wreck ... 0 1 1 0 1 0 1 1 0 -1
yell 3 4 0.000017 15.844044 VBP VB 3 {VB, NN, VBP} 0 yell ... 0 0 1 0 1 0 0 1 0 -1
young 88 5 0.000499 10.969575 JJ JJ 5 {JJ, NN, NNP, VB, VBP} 0 young ... 1 0 0 1 0 1 0 1 0 1
youth 4 5 0.000023 15.429006 NN NN 2 {NN, NNP} 0 youth ... 1 0 1 1 0 1 0 1 0 1

1361 rows × 26 columns

In [87]:
emo_cols = "anger anticipation disgust fear joy sadness surprise trust sentiment".split()
BOW_SENT = BOW.join(VOCAB_SENT[['max_pos'] + emo_cols], on='term_str', rsuffix='_v').dropna()
In [88]:
BOW_SENT
Out[88]:
n max_pos anger anticipation disgust fear joy sadness surprise trust sentiment
script_id scene_id para_num sent_num term_str
1 1 0 0 black 1 JJ 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 -1.0
white 1 JJ 0.0 1.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0
1 5 addresses 1 VBZ 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
4 2 authentic 1 JJ 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0
fabrication 1 NNP 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 -1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7 118 125 0 dance 1 NN 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0
132 0 music 1 NN 0.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 1.0
134 0 cross 1 NNP 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 -1.0
1 cross 1 NNP 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 -1.0
2 dance 2 NN 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0

8237 rows × 11 columns

In [89]:
DOC_SENT = BOW_SENT.pivot_table(index='script_id', columns='term_str', values=['anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust', 'sentiment'], fill_value=0)


DOC_SENT
Out[89]:
anger ... trust
term_str abandon abandoned absence absent abuse abyss academic accident accidental accompaniment ... worry worrying worse worship worthless wound wreck yell young youth
script_id
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
5 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
6 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
7 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

7 rows × 12249 columns

In [90]:
DOC_SENT_SENTIMENT = DOC_SENT['sentiment']


SENT_DF = pd.merge(DOC_SENT_SENTIMENT, LIB, left_index=True, right_on='script_id', how='inner')
SENT_DF
Out[90]:
abandon abandoned absence absent abuse abyss academic accident accidental accompaniment ... yell young youth title_y years era source scene_regex movie_len n_scenes
script_id
1 -1.0 0.0 0.0 0.0 0.0 0.0 0.0 -1.0 0.0 1.0 ... 0.0 1.0 0.0 Asteroid City 2023 late C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(INT\.|TITLE SEQUENCE:|EXT\.|SPLIT-SCREEN:... 27624 62
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -1.0 0.0 ... -1.0 1.0 0.0 Bottle Rocket 1996 early C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(EXT\.|INT\.|EXT/INT\.) 19464 94
3 0.0 -1.0 0.0 0.0 0.0 0.0 0.0 -1.0 0.0 0.0 ... 0.0 1.0 1.0 French Dispatch 2021 late C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(Obituary|EXT\.|CUT TO:|MONTAGE:|In the Fi... 30868 182
4 0.0 -1.0 -1.0 -1.0 0.0 -1.0 0.0 0.0 0.0 0.0 ... -1.0 1.0 0.0 Grand Budapest Hotel 2014 middle C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(EXT\.|INT\.|MONTAGE:|CUT TO:|INSERT:|TITLE:) 27185 177
5 0.0 0.0 0.0 0.0 -1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 Moonrise Kingdom 2012 middle C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(INT\.|EXT\.|TITLES OVER:|CUT TO:|INSERT:|... 24877 138
6 0.0 -1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... -1.0 1.0 0.0 Royal Tennenbaums 2001 early C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(INSERT:|CUT TO:|INT\.|EXT\.|MONTAGE:) 24939 222
7 0.0 0.0 0.0 0.0 0.0 0.0 1.0 -1.0 0.0 0.0 ... 0.0 1.0 0.0 Rushmore 1998 early C:\Users\ddj6tu\Documents\GitHub\DS5001\Final_... ^\s*(INT\.|CUT TO:|TITLE:|EXT\.|INSERT|RESEARC... 21535 118

7 rows × 1368 columns

In [91]:
grouped_df = SENT_DF.groupby('years')

sentiment_by_year = grouped_df.apply(lambda x: x.iloc[:, 0:1361].apply(pd.to_numeric, errors='coerce').mean()).reset_index()

fig = px.line(
    sentiment_by_year,
    x='years',  
    y=sentiment_by_year.columns[1], 
    title='Mean Sentiment by Year',
    labels={'years': 'Year'},  
    markers=True  
)

fig.show()
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\3179877830.py:3: DeprecationWarning:

DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.

Word2Vec¶

In [92]:
docs = CORPUS[~CORPUS.pos.str.match('NNPS?')].dropna(subset=['term_str'])\
    .groupby(SENT)\
    .term_str.apply(lambda  x:  x.tolist())\
    .reset_index()['term_str'].tolist()
docs = [doc for doc in docs if len(doc) > 1] 
In [93]:
import pandas as pd
import numpy as np
from gensim.models import word2vec
from gensim.corpora import Dictionary
from sklearn.manifold import TSNE as tsne
import plotly_express as px
In [94]:
import gensim
gensim.__version__
Out[94]:
'4.3.3'
In [95]:
# word2vec parameters
w2v_params = dict(
    window = 5,
    vector_size = 246,
    min_count = 50, # THIS LIMITS OUR VOCAB
    workers = 4
)
In [96]:
vocab = Dictionary(docs)
In [97]:
model = word2vec.Word2Vec(docs, **w2v_params)
In [98]:
vocab = model.wv.index_to_key 

word_vectors = model.wv.get_normed_vectors()

W2V = pd.DataFrame(word_vectors, index=vocab)

W2V
Out[98]:
0 1 2 3 4 5 6 7 8 9 ... 236 237 238 239 240 241 242 243 244 245
the 0.061662 -0.114587 -0.013411 0.049237 -0.019735 -0.048255 0.011538 -0.044835 -0.169090 -0.063417 ... 0.012361 0.024137 -0.008331 -0.039298 -0.005002 -0.054072 -0.043507 -0.027745 0.083137 0.076604
a 0.045642 -0.095899 0.000209 0.043699 0.002286 -0.059206 0.012981 -0.059315 -0.185802 -0.079814 ... 0.010301 -0.000763 0.016885 -0.034742 0.007390 -0.057769 -0.047956 -0.042989 0.054543 0.066632
and 0.063052 -0.129563 -0.008656 0.043561 0.000264 -0.060959 -0.018276 -0.054806 -0.186765 -0.064324 ... 0.002699 0.025620 0.002572 -0.032011 0.002405 -0.064659 -0.030632 -0.025463 0.082791 0.063598
of 0.041239 -0.110470 0.003377 0.037904 0.006403 -0.070319 -0.015618 -0.074511 -0.188593 -0.081859 ... -0.001532 -0.017108 0.030993 -0.043745 0.007583 -0.073758 -0.030773 -0.032773 0.051215 0.073910
to -0.005363 -0.045059 0.021593 0.013028 0.002145 -0.048232 -0.129133 0.006744 -0.083075 -0.009304 ... -0.048809 0.083383 0.021390 0.002768 0.041702 -0.069906 -0.009895 -0.027302 0.030008 0.024399
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
every 0.015675 -0.073669 0.023438 0.022841 -0.003873 -0.057982 -0.094052 -0.020734 -0.140234 -0.046660 ... -0.033663 0.072350 0.013848 -0.019384 0.028436 -0.077645 -0.008306 -0.035662 0.051916 0.040544
let -0.027250 -0.008303 0.034657 0.001060 -0.000434 -0.041714 -0.118841 0.016291 -0.061241 -0.012690 ... -0.050286 0.084009 0.018705 0.005458 0.044029 -0.064054 0.009572 -0.034025 0.014033 0.011031
ten 0.019050 -0.082811 0.020524 0.026385 0.000223 -0.060145 -0.085252 -0.025562 -0.146130 -0.044765 ... -0.028616 0.069053 0.017981 -0.018235 0.029832 -0.076593 -0.011033 -0.034493 0.052572 0.043054
chair 0.038542 -0.115938 0.007069 0.038383 -0.002248 -0.061542 -0.050139 -0.044249 -0.179698 -0.057000 ... -0.010139 0.044378 0.015401 -0.029592 0.013941 -0.073150 -0.019933 -0.030727 0.069193 0.053102
etheline 0.021201 -0.087620 0.017109 0.027859 -0.004503 -0.058980 -0.090568 -0.028105 -0.151700 -0.045393 ... -0.033249 0.070590 0.016367 -0.020575 0.026713 -0.078587 -0.010485 -0.031704 0.057964 0.044489

330 rows × 246 columns

In [99]:
def get_vector(row):
    w = row.name
    try:
        vec = model.wv[w]
    except KeyError as e:
        vec = None
    return vec
In [100]:
WV = pd.DataFrame(VOCAB.apply(get_vector, axis=1).dropna()).apply(lambda x: pd.Series(x[0]), axis=1)
In [101]:
tsne_engine = tsne(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
In [102]:
tsne_model = tsne_engine.fit_transform(WV.to_numpy())
C:\Users\ddj6tu\AppData\Local\anaconda3.1\Lib\site-packages\sklearn\manifold\_t_sne.py:1164: FutureWarning:

'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.

In [103]:
TSNE = pd.DataFrame(tsne_model, columns=['x','y'], index=WV.index)
In [104]:
X = TSNE.join(VOCAB, how='left')
In [105]:
px.scatter(X.reset_index(), 'x', 'y', 
           text='term_str', 
           color='max_pos', 
           hover_name='term_str',          
           size='dfidf',
           height=1000).update_traces(
                mode='markers+text', 
                textfont=dict(color='black', size=14, family='Arial'),
                textposition='top center')
C:\Users\ddj6tu\AppData\Local\anaconda3.1\Lib\site-packages\plotly\express\_core.py:1979: FutureWarning:

When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.

Visualization 1¶

In [106]:
EMO_BOOKS = BOW_SENT.groupby(['script_id'])[emo_cols].mean()
EMO_CHAPS = BOW_SENT.groupby(['script_id','scene_id'])[emo_cols].mean()
In [107]:
EMO_BOOKS.index = LIB.title
In [108]:
EMO_BOOKS.plot.barh(figsize=(15,30));
plt.savefig('Vis1.png')

Visualization 2¶

In [109]:
ERAS = LIB.groupby('era')[['movie_len', 'n_scenes']].mean()
In [ ]:
 
In [110]:
ERAS.style.background_gradient()
Out[110]:
  movie_len n_scenes
era    
early 21979.333333 144.666667
late 29246.000000 122.000000
middle 26031.000000 157.500000

Visualization 3¶

In [118]:
VOCAB['n2'] = DTM.sum()
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\3643906384.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [119]:
VOCAB['p2'] = VOCAB.n2 / VOCAB.n2.sum()
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\2070894287.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [120]:
VOCAB['i2'] = -np.log2(VOCAB.p2)
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\208349894.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [122]:
VOCAB['dp'] = VOCAB.df / len(LIB)
VOCAB['di'] = np.log2(1/VOCAB.dp)
VOCAB['dh'] = VOCAB.dp * VOCAB.di
C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\3670820122.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\3670820122.py:2: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

C:\Users\ddj6tu\AppData\Local\Temp\ipykernel_31684\3670820122.py:3: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [124]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=False)

px.scatter(VOCAB.reset_index(), x='i2', y='dfidf', 
           hover_name='term_str', hover_data=['n'],
           color='max_pos', 
           height=500, width=800)
C:\Users\ddj6tu\AppData\Local\anaconda3.1\Lib\site-packages\plotly\express\_core.py:1979: FutureWarning:

When grouping with a length-1 list-like, you will need to pass a length-1 tuple to get_group in a future version of pandas. Pass `(name,)` instead of `name` to silence this warning.

In [ ]:
 
In [ ]:
 

Files¶

In [113]:
output_dir = "C:/Users/ddj6tu/Documents/GitHub\DS5001/Final_Project_ddj6tu/output"
data_prefix = "ANDERSON"
In [114]:
LIB.to_csv(f"{output_dir}/{data_prefix}-LIB.csv")
CORPUS.to_csv(f"{output_dir}/{data_prefix}-CORPUS.csv")
VOCAB.to_csv(f"{output_dir}/{data_prefix}-VOCAB.csv")
BOW.to_csv(f"{output_dir}/{data_prefix}-BOW.csv")
DTM.to_csv(f"{output_dir}/{data_prefix}-DTM.csv")
TFIDF.to_csv(f"{output_dir}/{data_prefix}-TFIDF.csv")
TFIDF_RED.to_csv(f"{output_dir}/{data_prefix}-TFIDF_RED.csv")
COMPS.to_csv(f"{output_dir}/{data_prefix}-COMPS.csv")
DCM.to_csv(f"{output_dir}/{data_prefix}-DCM.csv")
LOADINGS.to_csv(f"{output_dir}/{data_prefix}-LOADINGS.csv")
TOPICS.to_csv(f"{output_dir}/{data_prefix}-TOPICS.csv")
THETA.to_csv(f"{output_dir}/{data_prefix}-THETA.csv")
PHI.to_csv(f"{output_dir}/{data_prefix}-PHI.csv")
VOCAB_SENT.to_csv(f"{output_dir}/{data_prefix}-VOCAB_SENT.csv")
BOW_SENT.to_csv(f"{output_dir}/{data_prefix}-BOW_SENT.csv")
DOC_SENT.to_csv(f"{output_dir}/{data_prefix}-DOC_SENT.csv")
W2V.to_csv(f"{output_dir}/{data_prefix}-W2V.csv")
In [ ]: